R coding to calculate standardised rates and ratios from cancer registry data
Author
Anshu Uppal
Published
April 28, 2025
Code
# install.packages("pacman")pacman::p_load( here, tidyverse,# Rcan, # package for Cancer Registry Data Analysis and Visualisation PHEindicatormethods, # package for standardising rates# sf, # for spatial functions DT # package for table formatting and styling)# Read in the pre-processed data -------------------------------------# source(here("code", "01_prep.R"))# Most recent data, from CI5-XIIswiss_XII <-readRDS(here("data", "generated_data", "swiss_XII.rds"))# National data, from CI5-XIIswiss_ref_XII <-readRDS(here("data", "generated_data", "swiss_ref_XII.rds"))# Annual data, from CI5-I to CI5-XIIswiss_plus <-readRDS(here("data", "generated_data", "swiss_plus.rds"))# # Canton polygons data# canton_boundaries <- readRDS(here("data", "generated_data", "canton_boundaries.rds"))
Code
/* set DT table fontsizes */th { font-size:11px; } /* header font */td { font-size:11px; } /* cell font */
CI5 XII (2013-2017)
This section uses data from Volume XII of the IARC’s “Cancer Incidence in Five Continents (CI5)” collaboration: https://ci5.iarc.fr/ci5-xii/download. The full dataset was then restricted to data from Switzerland-based cancer registries only.
The data pre-processing code can be found in the 01_prep.R file on the GitHub repository for this page.
Direct standardisation for age-standardised rates
Note
When using the calculate_dsr() function from the PHEindicatormethods package, when the cases within the strata are less than 10 the DSR is suppressed (“When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output”).
# "When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output." (from the `PHEindicatormethods::calculate_dsr()` funtion)## European Standard Populationdsr_swiss_ESP <- swiss_XII |># filter(# cancer_label %in% c("Colon")# , id_region %in% c("Geneva", "Vaud")# ) |> # Add grouping variables of interestgroup_by(period, cancer_label, id_region, sex) |> PHEindicatormethods::calculate_dsr(x = cases, # field containing count of eventsn = py, # field containing population denominatorsstdpop = EuropeanStandardPopulation, # field containing standard populationstype ="standard" ) |>mutate(crude_rate = total_count/total_pop*100000) |>relocate(crude_rate, .before = value) |>rename(cases = total_count, ASRate = value, lower_CI = lowercl, upper_CI = uppercl) |>mutate(across(c(crude_rate:upper_CI), ~round(.x, 2)))
dsr_swiss_ESP |>filter(cancer_label %in%c("All sites", "All sites but skin", "Other skin")) |>ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+geom_col(position =position_dodge(width =0.4))+theme_bw()+labs(x ="Age-standardised incidence rate, per 100,000", y =NULL,color =NULL, fill =NULL)+geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),height =0.4, linewidth =0.5,position =position_dodge(width =0.4))+facet_wrap(.~cancer_label, labeller =label_wrap_gen(),ncol =5)+theme(legend.position ="top")
Code
dsr_swiss_ESP |>filter(!cancer_label %in%c("All sites", "All sites but skin", "Other skin")) |># filter(cancer_label %in% c("Breast", "Colon", "Adrenal gland")) |>ggplot(aes(x = ASRate, y = id_region, color = sex, fill = sex))+geom_col(position =position_dodge(width =0.4))+theme_bw()+labs(x ="Age-standardised incidence rate, per 100,000", y =NULL,color =NULL, fill =NULL)+geom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI),height =0.4, linewidth =0.5,position =position_dodge(width =0.4))+facet_wrap(.~cancer_label, labeller =label_wrap_gen(),ncol =5)+theme(legend.position ="top")
Code
dsr_swiss_ESP |> DT::datatable(filter ="top",options =list(pageLength =26 ),rownames =FALSE, # set to FALSE for cleaner lookclass ='cell-border stripe hover nowrap compact' )
Code
# "When the total count is < 10 DSRs are not reliable and will therefore be suppressed in the output." (from the `PHEindicatormethods::calculate_dsr()` funtion)## European Standard Populationdsr_swiss_WSP <- swiss_XII |># filter(# cancer_label %in% c("Colon")# , id_region %in% c("Geneva", "Vaud")# ) |> # Add grouping variables of interestgroup_by(period, cancer_label, id_region, sex) |> PHEindicatormethods::calculate_dsr(x = cases, # field containing count of eventsn = py, # field containing population denominatorsstdpop = WorldStandardPopulation, # field containing standard populationstype ="standard" ) |>mutate(crude_rate = total_count/total_pop*100000) |>relocate(crude_rate, .before = value) |>rename(cases = total_count, ASRate = value, lower_CI = lowercl, upper_CI = uppercl) |>mutate(across(c(crude_rate:upper_CI), ~round(.x, 2)))
age_crude |># Remove the text " years" while keeping the factor structuremutate(age_ESP =fct_relabel(age_ESP, ~str_remove(.x, " years"))) |>filter(id_region %in%c("Geneva")) |># filter(cancer_label %in% c("All sites", "Colon", "Breast")) |> ggplot(aes(x=age_ESP, y = crude_rate, color = sex, group = sex))+geom_path()+facet_wrap(.~cancer_label, scales ="free_y",labeller =label_wrap_gen(),ncol =6 )+labs(y ="Crude incidence rate per 100,000", x ="Age group", color =NULL)+theme_bw()+# Show only every third label on x-axisscale_x_discrete(breaks = \(x)x[seq(3, length(x), by =3)])+theme(axis.text.x =element_text(angle =50, vjust =0.8),legend.position ="top")
Code
age_crude |># Remove the text " years" while keeping the factor structuremutate(age_ESP =fct_relabel(age_ESP, ~str_remove(.x, " years"))) |>filter(id_region %in%c("National")) |># filter(cancer_label %in% c("All sites", "Colon", "Breast")) |>ggplot(aes(x=age_ESP, y = crude_rate, color = sex, group = sex))+geom_path()+facet_wrap(.~cancer_label, scales ="free_y",labeller =label_wrap_gen(),ncol =6 )+labs(y ="Crude incidence rate per 100,000", x ="Age group", color =NULL)+theme_bw()+# Show only every third label on x-axisscale_x_discrete(breaks = \(x)x[seq(3, length(x), by =3)])+theme(axis.text.x =element_text(angle =50, vjust =0.8),legend.position ="top")
# Format it as a tableage_crude |>filter(sex =="Male") |>DT::datatable(filter ="top",options =list(pageLength =18 ),rownames =FALSE, # set to FALSE for cleaner lookclass ='cell-border stripe hover nowrap compact')
Code
age_crude |>filter(sex =="Female") |> DT::datatable(filter ="top",options =list(pageLength =18 ),rownames =FALSE, # set to FALSE for cleaner lookclass ='cell-border stripe hover nowrap compact' )
Indirect standardisation
During the pre-processing, I aggregated the data to the level of the whole of Switzerland, per cancer type and summing the cases and person-years by age and sex. These national reference cases and person-years are then used below to calculate indirectly standardised rates and ratios for each canton for each cancer type, by sex.
Indirectly standardised rates
Code
ind.rate <- swiss_XII |>group_by(cancer_label, sex, id_region) |> PHEindicatormethods::calculate_ISRate(x = cases,n = py,x_ref = cases_ref, # national cases as referencen_ref = py_ref, # national py as referencerefpoptype ="field" ) |>rename(ISRate = value, lower_CI = lowercl, upper_CI = uppercl) |>mutate(across(c(expected:upper_CI), ~round(.x, 2)))# Generate the tableind.rate |>select(-c(confidence:method)) |> DT::datatable(caption ="Indirectly standardised rate per 100,000, with 95% CI",filter ="top",options =list(pageLength =14 ),rownames =FALSE, # set to FALSE for cleaner lookclass ='cell-border stripe hover nowrap compact' )
Indirectly standardised ratios
Code
ind.ratio <- swiss_XII |>group_by(cancer_label, sex, id_region) |> PHEindicatormethods::calculate_ISRatio(x = cases,n = py,x_ref = cases_ref, # national cases as referencen_ref = py_ref, # national py as referencerefpoptype ="field" ) |>rename(ISRatio = value, lower_CI = lowercl, upper_CI = uppercl) |>mutate(across(c(expected:upper_CI), ~round(.x, 2)))
ind.ratio |>filter(cancer_label %in%c("All sites", "Colon", "Rectum","Bladder", "Prostate", "Breast","Bone", "Vulva", "Lung (incl. trachea and bronchus)" )) |>mutate(est_ci_label =sprintf("%.2f [%.2f-%.2f]", ISRatio, lower_CI, upper_CI) ) |># arrange(id_region, sex) |> # Start the plotggplot(aes(x = ISRatio, y = id_region, color = sex))+# Add CIgeom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI), height =0.4, linewidth =0.5,position =position_dodge(width =0.5) )+# Add point estimategeom_point(size =1.5, position =position_dodge(width =0.5))+# Add vertical reference line at ISRatio = 1geom_vline(xintercept =1, linetype ="dashed", color ="grey50")+# Separate plots by cancer typefacet_wrap(~ cancer_label, labeller =label_wrap_gen(),ncol =3# scales = "free_x" # Use free_y so regions aren't duplicated across facets )+# Add labels and titlelabs(title ="Indirectly Standardized Ratio by Canton and Sex", subtitle ="(reference: national data for Switzerland)",x ="IS Ratio (95% CI)",y =NULL,color =NULL# Legend title ) +# Use a clean themetheme_bw(base_size =12) +theme(panel.grid.major.x =element_blank(),panel.grid.minor.x =element_blank(),# strip.text = element_text(face = "bold"), # Make facet titles boldaxis.text.y =element_text(size =12), # Adjust y-axis text size if needed# panel.spacing.y = unit(1, "lines"), # Add space between facetslegend.position ="top"# Position legend )
Code
ind.ratio |>filter(id_region %in%c("Geneva")) |>mutate(est_ci_label =sprintf("%.2f [%.2f-%.2f]", ISRatio, lower_CI, upper_CI) ) |># arrange(id_region, sex) |> # Start the plotggplot(aes(x = ISRatio, y = cancer_label, color = sex))+# Add CIgeom_errorbarh(aes(xmin = lower_CI, xmax = upper_CI), height =0.4, linewidth =0.5,position =position_dodge(width =0.5) )+# Add point estimategeom_point(size =1.5, position =position_dodge(width =0.5))+# Add vertical reference line at ISRatio = 1geom_vline(xintercept =1, linetype ="dashed", color ="grey50")+# Separate plots by cancer typefacet_wrap(~ id_region, labeller =label_wrap_gen(),ncol =3# scales = "free_x" # Use free_y so regions aren't duplicated across facets )+# Add labels and titlelabs(title ="Indirectly Standardized Ratio by Canton and Sex", subtitle ="(reference: national data for Switzerland)",x ="IS Ratio (95% CI)",y =NULL,color =NULL# Legend title ) +# Use a clean themetheme_bw(base_size =12) +theme(panel.grid.major.x =element_blank(),panel.grid.minor.x =element_blank(),# strip.text = element_text(face = "bold"), # Make facet titles boldaxis.text.y =element_text(size =12), # Adjust y-axis text size if needed# panel.spacing.y = unit(1, "lines"), # Add space between facetslegend.position ="top"# Position legend )
Code
# Generate the tableind.ratio |>select(-c(confidence:method)) |> DT::datatable(caption ="indirectly standardised ratio x 1, with 95% CI",filter ="top",options =list(pageLength =13 ),rownames =FALSE, # set to FALSE for cleaner lookclass ='cell-border stripe hover nowrap compact' )
CI5 I-XII: Geneva (annual trends 1973-2017)
This section uses annual data from Volumes I to XII of the IARC’s “Cancer Incidence in Five Continents (CI5)” collaboration: https://ci5.iarc.fr/ci5plus/download. The full dataset was then restricted to data from Switzerland-based cancer registries only.
The data pre-processing code can be found in the 01_prep.R file on the GitHub repository for this page.
Direct standardisation, using either the European or the World Standard Populations, was used to calculate age-standardised rates for each year.